In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [75]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [76]:
test = pd.read_csv("test.csv")
test.head()
Out[76]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [77]:
train = pd.read_csv("train (1).csv")
train.head()
Out[77]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [78]:
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
In [79]:
test.describe()
Out[79]:
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
In [80]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [81]:
train.describe()
Out[81]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [82]:
df = pd.concat([train,test], ignore_index=True)
In [83]:
df.head()
Out[83]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0.0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1.0 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0.0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

1. Understanding Dataset¶

Key columns:

Column Description
PassengerId Unique ID
Survived 0 = No, 1 = Yes
Pclass Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
Name Passenger name
Sex Male/Female
Age Age in years
SibSp of siblings/spouses aboard
Parch of parents/children aboard
Ticket Ticket number
Fare Passenger fare
Cabin Cabin number
Embarked Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
In [84]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB
In [85]:
df.describe()
Out[85]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 1309.000000 891.000000 1309.000000 1046.000000 1309.000000 1309.000000 1308.000000
mean 655.000000 0.383838 2.294882 29.881138 0.498854 0.385027 33.295479
std 378.020061 0.486592 0.837836 14.413493 1.041658 0.865560 51.758668
min 1.000000 0.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 328.000000 0.000000 2.000000 21.000000 0.000000 0.000000 7.895800
50% 655.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 982.000000 1.000000 3.000000 39.000000 1.000000 0.000000 31.275000
max 1309.000000 1.000000 3.000000 80.000000 8.000000 9.000000 512.329200

2. Feature Engineering¶

  1. Title
  2. Age group
  3. Family Size
  4. Family Type
  5. Individual Fare
  6. Deck
In [86]:
df["Name"]
Out[86]:
Name
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
... ...
1304 Spector, Mr. Woolf
1305 Oliva y Ocana, Dona. Fermina
1306 Saether, Mr. Simon Sivertsen
1307 Ware, Mr. Frederick
1308 Peter, Master. Michael J

1309 rows × 1 columns


In [87]:
df["Title"] =  df["Name"].str.split(',').str[1].str.strip( ).str.split(".").str[0]
In [88]:
df['Title'] = np.where((df["Title"] == "Ms") | (df["Title"] == "Mlle"), "Miss", df["Title"])
df['Title'] = np.where(df["Title"] == "Mme", "Mrs", df["Title"])
In [89]:
df["Title"].value_counts()
Out[89]:
count
Title
Mr 757
Miss 264
Mrs 198
Master 61
Rev 8
Dr 8
Col 4
Major 2
Don 1
Lady 1
Sir 1
Capt 1
the Countess 1
Jonkheer 1
Dona 1

In [90]:
df["Family Size"] = df["SibSp"] + df["Parch"] + 1
In [91]:
df["Family Size"].value_counts()
Out[91]:
count
Family Size
1 790
2 235
3 159
4 43
6 25
5 22
7 16
11 11
8 8

In [92]:
df["Family Type"] = np.where(df["Family Size"] == 1, "Solo",
                             np.where((df["Family Size"] >1) & (df["Family Size"] < 5), "Small",
                                      np.where((df["Family Size"] > 4)& (df["Family Size"] < 7 ),"Medium", "Large")))
In [93]:
df["Family Type"].value_counts()
Out[93]:
count
Family Type
Solo 790
Small 437
Medium 47
Large 35

In [94]:
df["Individual Fare"] = df["Fare"]/df["Family Size"]
In [95]:
df["Deck"] = df["Cabin"].str[0]
df["Deck"].value_counts()
Out[95]:
count
Deck
C 94
B 65
D 46
E 41
A 22
F 21
G 5
T 1

In [96]:
df["Age Group"] = pd.cut(df["Age"], bins=[0,2,17,30,45,100], labels=["Baby", "Child", "Young Adult", "Middle Aged Adult", "Senior"])
df["Age Group"].value_counts()
Out[96]:
count
Age Group
Young Adult 455
Middle Aged Adult 282
Senior 155
Child 120
Baby 34

3. Data Analysis¶

  • Univariate Analysis
  • Bivariate Analysis
  • Multivariate Analysis

a. Univariate Analysis:¶

methods for statistics & distribution¶

In [97]:
def stats(df, col):
  if pd.api.types.is_numeric_dtype(df[col]):

    print(col, "- Numerical Column")
    print()

    null_count = df[col].isnull().sum().item()
    print("\nNull values are: ",null_count)

    null_percent = round((null_count/df.shape[0])*100,2)
    print("\nNull values percentage is: ",null_percent)

    mean = df[col].mean()
    print("\nMean is: ",mean)

    median = df[col].median()
    print("\nMedian is: ",median)

    mode = df[col].mode()[0]
    print("\nMode is: ",mode)

    std = df[col].std()
    print("\nStandard Deviation is: ",std)

    var = df[col].var()
    print("\nVariance is: ",var)

    skew = df[col].skew()
    print("\nSkewness is: ",skew)

    kurt = df[col].kurt()
    print("\nKurtosis is: ",kurt)

  else:
    print(col,"- Categorical Column")
    print()
    null_count = df[col].isnull().sum().item()
    print("\nNull values are: ",df[col].isnull().sum().item())

    null_percent = round((null_count/df.shape[0])*100,2)
    print("\nNull values percentage is: ",null_percent)

    mode = df[col].mode()[0]
    print("\nMode is: ",mode)

    print("\nValue Counts are: \n")
    print(df[col].value_counts())
In [98]:
def num_cat_plots(df, col):

    # numerical column
    if pd.api.types.is_numeric_dtype(df[col]):
        print()
        print("Numerical column Analysis : -\n")
        print()
        stats(df,col)
        print()
        plt.figure(figsize=(17,5))
        sns.set_style("whitegrid")
        sns.set_palette("Set2")

        plt.subplot(1,3,1)
        sns.histplot(x=df[col])
        plt.title("Histogram")

        plt.subplot(1,3,2)
        sns.kdeplot(x=df[col],fill=True)
        plt.title("KDE Plot")

        plt.subplot(1,3,3)
        sns.boxplot(x=df[col])
        plt.title("Boxplot")

        plt.show()

    else:
        print()
        # Categorical column
        print("Categorical column Analysis : -\n")
        stats(df,col)

        #count plot
        plt.figure(figsize=(12,5))
        sns.countplot(data=df, x=col)
        plt.title(f"Count of different {col}")
        plt.show()

        # Pie chart
        pie_df = df[col].value_counts().reset_index()
        pie_df.columns = ["Category", "Count"]
        fig = px.pie(pie_df,
            names = "Category",
            values = "Count" ,
            title = (f"Proportion of different {col}"))
        fig.update_layout(
            title_x = 0.5,
            title_font = dict(size = 20 , color = "Red")
        )
        fig.show()

Analysis of all columns¶

In [99]:
all_columns = ['Age','Fare', 'Individual Fare',"Survived", 'Pclass', 'SibSp', 'Parch','Embarked', 'Family Size', 'Family Type', 'Deck','Sex','Age Group','Title']
all_cat_columns = ["Survived", 'Pclass', 'SibSp', 'Parch','Embarked', 'Family Size', 'Family Type', 'Deck','Sex','Title']

df1 = df.copy()
for i in all_columns :
  if i in all_cat_columns:
    df1[i] = df[i].astype('category')
    df1[i].dtype
  num_cat_plots(df1,i)
Numerical column Analysis : -


Age - Numerical Column


Null values are:  263

Null values percentage is:  20.09

Mean is:  29.881137667304014

Median is:  28.0

Mode is:  24.0

Standard Deviation is:  14.413493211271334

Variance is:  207.74878655136482

Skewness is:  0.40767455974362266

Kurtosis is:  0.1469476357378139

No description has been provided for this image
Numerical column Analysis : -


Fare - Numerical Column


Null values are:  1

Null values percentage is:  0.08

Mean is:  33.29547928134557

Median is:  14.4542

Mode is:  8.05

Standard Deviation is:  51.75866823917414

Variance is:  2678.959737892894

Skewness is:  4.367709134122922

Kurtosis is:  27.027986349442294

No description has been provided for this image
Numerical column Analysis : -


Individual Fare - Numerical Column


Null values are:  1

Null values percentage is:  0.08

Mean is:  20.51821514307558

Median is:  8.512483333333332

Mode is:  13.0

Standard Deviation is:  35.774336893842424

Variance is:  1279.8031801941354

Skewness is:  6.683189172409639

Kurtosis is:  66.46361442187475

No description has been provided for this image
Categorical column Analysis : -

Survived - Categorical Column


Null values are:  418

Null values percentage is:  31.93

Mode is:  0.0

Value Counts are: 

Survived
0.0    549
1.0    342
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Pclass - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  3

Value Counts are: 

Pclass
3    709
1    323
2    277
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

SibSp - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  0

Value Counts are: 

SibSp
0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Parch - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  0

Value Counts are: 

Parch
0    1002
1     170
2     113
3       8
4       6
5       6
6       2
9       2
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Embarked - Categorical Column


Null values are:  2

Null values percentage is:  0.15

Mode is:  S

Value Counts are: 

Embarked
S    914
C    270
Q    123
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Family Size - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  1

Value Counts are: 

Family Size
1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Family Type - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  Solo

Value Counts are: 

Family Type
Solo      790
Small     437
Medium     47
Large      35
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Deck - Categorical Column


Null values are:  1014

Null values percentage is:  77.46

Mode is:  C

Value Counts are: 

Deck
C    94
B    65
D    46
E    41
A    22
F    21
G     5
T     1
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Sex - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  male

Value Counts are: 

Sex
male      843
female    466
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Age Group - Categorical Column


Null values are:  263

Null values percentage is:  20.09

Mode is:  Young Adult

Value Counts are: 

Age Group
Young Adult          455
Middle Aged Adult    282
Senior               155
Child                120
Baby                  34
Name: count, dtype: int64
No description has been provided for this image
Categorical column Analysis : -

Title - Categorical Column


Null values are:  0

Null values percentage is:  0.0

Mode is:  Mr

Value Counts are: 

Title
Mr              757
Miss            264
Mrs             198
Master           61
Dr                8
Rev               8
Col               4
Major             2
Lady              1
Dona              1
Don               1
Capt              1
Jonkheer          1
Sir               1
the Countess      1
Name: count, dtype: int64
No description has been provided for this image

b. Bivariate Analysis:¶

i. Categorical - Categorical Analysis -¶

Survival - Other categorical column analysis¶

Method for survival rate , survival count & graphs¶
In [168]:
## survival rate
def survival_rate(df, *col):

  if not col:
        overall_rate = df["Survived"].mean()
        return pd.DataFrame({"Overall_Survival_rate": [overall_rate]})

  rate = df.groupby(list(col))["Survived"].mean().reset_index().rename(
        columns={"Survived": "Survival_rate"}
    )
  return rate

## survival count
def survival_count(df, *col):

    if not col:
        overall_counts = df["Survived"].value_counts().reset_index()
        overall_counts.columns = ['Survived', 'Count']
        return overall_counts

    grouping_list = list(col) + ["Survived"]
    count = df.groupby(grouping_list).size().reset_index(name="Count")

    return count

## Graphs

def cat_survial(df,col):

  temp2 = survival_rate(df,col)
  sns.barplot(data=temp2, x=col, y="Survival_rate")
  plt.title(f"Survival rate across {col}")
  plt.xticks(rotation=45)
  plt.show()

  xx= survival_count(df,col)
  xx["Rate"] = xx["Count"]/xx.groupby(col)["Count"].transform("sum")
  sns.barplot(data=xx , x = "Survived", y="Rate", hue = col)
  plt.title(f"Survival & Non Survival Rate across {col}")
  plt.xticks(rotation=45)
  plt.legend(title=col, bbox_to_anchor=(1.05, 1), loc='upper left')
  plt.tight_layout()
  plt.show()

  temp1 = survival_count(df,col)
  fig = px.bar(temp1, x = "Survived", y = "Count", color = col , barmode = "stack",
        height=500, width=700, title= f"Survival count of {col}")
  fig.update_layout(
        title_x = 0.5,
        title_font = dict(size = 20 , color = "Green")
    )
  fig.show()
Analysis on survival rate + survival count of all categorical columns¶
In [101]:
for i in all_cat_columns:
  if i != "Survived":
    cat_survial(df,i)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

P class - Other categorical column analysis¶

Methods for proportion of pclass across each categorical col¶
In [102]:
def pclass_cat_plot(df, col):
    # group by Pclass and categorical column
    xx = (
        df.groupby(["Pclass", col])
          .size()
          .reset_index(name="Count")
          .sort_values(by="Count", ascending=False)
    )

    # proportion within each Pclass
    xx["Proportion %"] = (
        xx["Count"] / xx.groupby("Pclass")["Count"].transform("sum")
    ) * 100

    # barplot (proportion per category)
    plt.figure(figsize=(8,5))
    sns.barplot(data=xx, x=col, y="Proportion %", hue="Pclass")
    plt.title(f"Pclass-wise Proportion Distribution for {col}")
    plt.xticks(rotation=45)
    plt.show()

    # Stacked bar chart
    pivot = xx.pivot(index="Pclass", columns=col, values="Proportion %").fillna(0)
    pivot.plot(kind="bar", stacked=True, figsize=(8,5))
    plt.ylabel("Proportion %")
    plt.title(f"Stacked Bar – Pclass vs {col}")
    plt.xticks(rotation=0)
    plt.legend(title=col, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
Analysis on proportion of pclass across each categorical col¶
In [103]:
for i in all_cat_columns:
  if i != "Pclass":
    pclass_cat_plot(df,i)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Sex - Other categorical column analysis¶

In [104]:
temp_df = df.groupby(["Family Type","Sex"]).size().reset_index(name="Count")
fig = px.bar(temp_df, x="Family Type", y='Count' , color = "Sex", title="Count of Male/Female across family types" )
fig.update_layout(
    title_x = 0.5,
    title_font = dict(size = 18 , color = "Green")
)
In [105]:
temp_df = df.groupby(["Deck","Sex"]).size().reset_index(name="Count")
sns.barplot(data = temp_df, x="Deck", y='Count' , hue = "Sex" )
plt.title("Count of Male/Female in each deck")
plt.show()
No description has been provided for this image
In [106]:
temp_df = df.groupby(["Embarked","Sex"]).size().reset_index(name="Count")
sns.barplot(data = temp_df, x="Embarked", y='Count' , hue = "Sex" )
plt.title("Count of Male/Female in each Embarked")
plt.show()
No description has been provided for this image

ii. Numerical - Numerical Analysis -¶

In [107]:
def num_num_analysis(df, col1, col2):

    # Scatter Plot
    plt.figure(figsize=(7,5))
    sns.scatterplot(data=df, x=col1, y=col2)
    plt.title(f"Scatter Plot: {col1} vs {col2}")
    plt.show()

    # Joint Plot

    sns.jointplot(data=df, x=col1, y=col2, kind="scatter", height=7)
    plt.suptitle(f"Joint Plot: {col1} vs {col2}", y=1.02)
    plt.show()

    # KDE Plot
    plt.figure(figsize=(7,5))
    sns.kdeplot(data=df, x=col1, y=col2)
    plt.title(f"KDE Plot: {col1} vs {col2}")
    plt.show()
In [108]:
num_num_analysis(df,'Age','Fare')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [109]:
num_num_analysis(df,'Age','Individual Fare')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

iii. Numerical - categorical Analysis -¶

In [110]:
def num_cat_analysis(df, num_col, cat_col, agg="mean"):

    # Boxplot
    plt.figure(figsize=(8,5))
    sns.boxplot(data=df, x=cat_col, y=num_col)
    plt.title(f"Boxplot of {num_col} across {cat_col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Violin Plot
    plt.figure(figsize=(8,5))
    sns.violinplot(data=df, x=cat_col, y=num_col)
    plt.title(f"Violin Plot of {num_col} across {cat_col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Aggregated Bar Plot: Mean / Median
    temp = df.groupby(cat_col)[num_col].agg(agg).reset_index()
    temp.rename(columns={num_col: f"{agg}_{num_col}"}, inplace=True)

    plt.figure(figsize=(7,4))
    sns.barplot(data=temp, x=cat_col, y=f"{agg}_{num_col}", palette="viridis")
    plt.title(f"{agg.title()} {num_col} by {cat_col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
In [111]:
for i in ['Survived',"Pclass","Sex","Age Group","Embarked","Family Type"]:
  print("Fare vs other categorical columns")
  num_cat_analysis(df, 'Fare', i, agg="mean")
  print("Age vs other categorical columns")
  num_cat_analysis(df, 'Age', i, agg="mean")
  print("Individual Fare vs other categorical columns")
  num_cat_analysis(df, 'Individual Fare', i, agg="mean")
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Age vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Individual Fare vs other categorical columns
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

c. Multivariate Analysis:¶

Pclass + Sex + Survived¶
In [112]:
temp = df.groupby(["Pclass","Sex","Survived"]).size().reset_index(name="Count")

#Clustered barchart
plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Pclass", y="Survived", hue="Sex")
plt.title("Survival Rate by Pclass and Sex")
plt.show()

# sunburst
px.sunburst(temp , path = ["Pclass", "Sex","Survived"], color = "Count",
            height = 500, width = 500, title = "Proportion of survivals across Pclass , Sex")
No description has been provided for this image

Age + Sex + Survived¶

In [113]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Sex", y="Age", hue="Survived")
plt.title("Age Distribution by Sex and Survival")
plt.show()

g = sns.FacetGrid(df, col="Sex", hue="Survived", height=4)
g.map(sns.kdeplot, "Age", fill=True, alpha=0.6)
g.add_legend()
plt.show()
No description has been provided for this image
No description has been provided for this image

Fare + Pclass + Survived¶

In [114]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Fare", y="Pclass", hue="Survived", alpha=0.7)
plt.title("Fare vs Pclass by Survival")
plt.show()

plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Pclass", y="Fare", hue="Survived")
plt.title("Fare Distribution by Pclass and Survival")
plt.show()
No description has been provided for this image
No description has been provided for this image

Embarked + Pclass + Survived¶

In [115]:
temp = df.groupby(["Embarked","Pclass","Survived"]).size().reset_index(name="Count")
fig = px.sunburst(temp , path = ["Embarked","Pclass","Count"],color = "Count",
                  height = 500, width = 500, title = "Count of passengers across different pclass , embark")
fig.show()

plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Embarked", y="Survived", hue="Pclass")
plt.title("Survival Rate by Embarked Port and Pclass")
plt.show()
No description has been provided for this image

FamilySize + Pclass + Survived¶

In [116]:
plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Family Size", y="Survived", hue="Pclass")
plt.title("Survival Rate by Family Size and Pclass")
plt.show()
No description has been provided for this image

Age + Fare + Survived¶

In [117]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Age", y="Fare", hue="Survived", alpha=0.7)
plt.title("Age vs Fare by Survival")
plt.show()

sns.jointplot(data=df, x="Age", y="Fare", hue="Survived", kind="kde")
plt.suptitle("KDE Joint Distribution – Age & Fare", y=1.02)
plt.show()
No description has been provided for this image
No description has been provided for this image

Age + Pclass + Survived¶

In [118]:
def age_pclass_survived_plots(df):
    plt.figure(figsize=(14,5))

    # Boxplot: Age vs Pclass split by Survived
    sns.boxplot(data=df, x="Pclass", y="Age", hue="Survived", palette="Set2")
    plt.title("Age Distribution Across Pclass (Survived vs Not)")
    plt.show()

    # Swarm Plot
    plt.figure(figsize=(14,5))
    sns.swarmplot(data=df, x="Pclass", y="Age", hue="Survived", palette="cool", dodge=True)
    plt.title("Swarm Plot: Age vs Pclass by Survival")
    plt.show()

    # Histogram Plot
    plt.figure(figsize=(14,5))
    sns.histplot(data=df, x="Age", hue="Pclass", multiple="stack", kde=True)
    plt.title("Stacked Age Histogram by Pclass")
    plt.show()

age_pclass_survived_plots(df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Survived + Deck + Pclass¶

In [119]:
def survived_deck_pclass_plots(df):

    temp = df.dropna(subset=["Deck"])

    # grouped barchart
    xx = temp.groupby(["Deck", "Pclass", "Survived"]).size().reset_index(name="Count")
    xx["Proportion Percentage"] = xx["Count"] / xx.groupby(["Deck", "Pclass"])["Count"].transform("sum")

    plt.figure(figsize=(12,6))
    sns.barplot(data=xx, x="Deck", y="Proportion Percentage", hue="Survived", palette="Set2")
    plt.title("Proportion Percentage of Survival by Deck")
    plt.show()

    # heatmap
    heat = temp.groupby(["Deck", "Pclass"])["Survived"].mean().unstack()
    plt.figure(figsize=(10,6))
    sns.heatmap(heat, annot=True, cmap="YlGnBu")
    plt.title("Survival Rate Heatmap (Deck vs Pclass)")
    plt.show()

survived_deck_pclass_plots(df)
No description has been provided for this image
No description has been provided for this image

4. Handling Null Values :¶

Methods for null value filling¶

In [120]:
## Checking fill values

def fill_methods(df, col):

    s = df[col]
    filled = {}

    if pd.api.types.is_numeric_dtype(df[col]):  # NUMERIC
        mean_val = s.mean()
        median_val = s.median()
        mode_val = s.mode()[0]

        non_null = s.dropna()
        rand_one = np.random.choice(non_null) # 1 random value from col
        rand_many = np.random.choice(non_null, size=s.isna().sum()) # multiple random values from col

        # null values filled with different methods
        filled["mean"]   = s.fillna(mean_val)
        filled["median"] = s.fillna(median_val)
        filled["mode"]   = s.fillna(mode_val)
        filled["rand1"]  = s.fillna(rand_one)
        filled["zero"]   = s.fillna(0) # filling with arbitrary value 0

        r = s.copy()
        r[r.isna()] = rand_many # filling with multiplr random values
        filled["rand-many"] = r

    else:  # CATEGORICAL
        mode_val = s.mode()[0]
        non_null = s.dropna()

        # Create a temporary series to add new categories for imputation
        temp_s = s.copy()
        if "None" not in temp_s.cat.categories:
            temp_s = temp_s.cat.add_categories("None")
        if "Unknown" not in temp_s.cat.categories:
            temp_s = temp_s.cat.add_categories("Unknown")

        filled["mode"]     = temp_s.fillna(mode_val)
        filled["none"]     = temp_s.fillna("None")
        filled["unknown"]  = temp_s.fillna("Unknown")
        filled["rand-cat"] = temp_s.fillna(np.random.choice(non_null))

    temp = pd.DataFrame(filled)

    return filled, temp

# Plotting imputation methods

def plots(original, filled_dict, colname):

    is_numeric = pd.api.types.is_numeric_dtype(original)

    for method, series in filled_dict.items():
        # KDE Plot (only for numeric columns)
        if is_numeric:
            plt.figure(figsize=(10,5))
            plt.title(f"KDE Plot Comparison for {colname} — {method}")
            original.dropna().plot(kind='kde', label='Original')
            series.plot(kind='kde', label=method)
            plt.legend()
            plt.show()

        # Histogram (works for both numeric and categorical)
        plt.figure(figsize=(10,5))
        plt.title(f"Histogram Comparison for {colname} — {method}")
        plt.hist(original.dropna(), alpha=0.5, label="Original")
        plt.hist(series, alpha=0.5, label=method)
        plt.legend()
        plt.show()

        # Boxplot (only for numeric columns)
        if is_numeric:
            plt.figure(figsize=(6,4))
            plt.title(f"Boxplot Comparison for {colname} — {method}")
            plt.boxplot([original.dropna(), series], labels=["Original", method])
            plt.show()

a. Age:-¶

In [121]:
## finding best imputation method
fill_dict , temp_df = fill_methods(df, "Age")
temp_df["Original"] = df["Age"]
temp_df
Out[121]:
mean median mode rand1 zero rand-many Original
0 22.000000 22.0 22.0 22.0 22.0 22.0 22.0
1 38.000000 38.0 38.0 38.0 38.0 38.0 38.0
2 26.000000 26.0 26.0 26.0 26.0 26.0 26.0
3 35.000000 35.0 35.0 35.0 35.0 35.0 35.0
4 35.000000 35.0 35.0 35.0 35.0 35.0 35.0
... ... ... ... ... ... ... ...
1304 29.881138 28.0 24.0 34.0 0.0 36.0 NaN
1305 39.000000 39.0 39.0 39.0 39.0 39.0 39.0
1306 38.500000 38.5 38.5 38.5 38.5 38.5 38.5
1307 29.881138 28.0 24.0 34.0 0.0 24.0 NaN
1308 29.881138 28.0 24.0 34.0 0.0 33.0 NaN

1309 rows × 7 columns

In [122]:
## plotting graph for each imputation method
plots(df['Age'], fill_dict , 'Age')
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning:

The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.

No description has been provided for this image
In [123]:
# before replacing null values
print("Null values (before): ", df["Age"].isnull().sum())

# After replacing null values
df["Age"]=temp_df["rand-many"]
print("Null values (after): ", df["Age"].isnull().sum())
Null values (before):  263
Null values (after):  0
Age Distribution after filling null values :¶
In [146]:
sns.kdeplot(x = df["Age"])
plt.title("Age distribution after replacing null values")
plt.show()
No description has been provided for this image

b. Fare , Individual Fare¶

In [125]:
stats(df, "Fare")
Fare - Numerical Column


Null values are:  1

Null values percentage is:  0.08

Mean is:  33.29547928134557

Median is:  14.4542

Mode is:  8.05

Standard Deviation is:  51.75866823917414

Variance is:  2678.959737892894

Skewness is:  4.367709134122922

Kurtosis is:  27.027986349442294
In [126]:
stats(df , "Individual Fare")
Individual Fare - Numerical Column


Null values are:  1

Null values percentage is:  0.08

Mean is:  20.51821514307558

Median is:  8.512483333333332

Mode is:  13.0

Standard Deviation is:  35.774336893842424

Variance is:  1279.8031801941354

Skewness is:  6.683189172409639

Kurtosis is:  66.46361442187475
In [127]:
df[df["Fare"].isnull()]
Out[127]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Family Size Family Type Individual Fare Deck Age Group
1043 1044 NaN 3 Storey, Mr. Thomas male 60.5 0 0 3701 NaN NaN S Mr 1 Solo NaN NaN Senior
In [128]:
df[df["Individual Fare"].isnull()]
Out[128]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Family Size Family Type Individual Fare Deck Age Group
1043 1044 NaN 3 Storey, Mr. Thomas male 60.5 0 0 3701 NaN NaN S Mr 1 Solo NaN NaN Senior
Replacing null value :¶
In [147]:
# before replacing null values

print("Null values in fare (before): ", df["Fare"].isnull().sum())
print("Null values in individual (before): ", df["Individual Fare"].isnull().sum())

# After replacing null values
df["Fare"]= df["Fare"].fillna(df["Fare"].median())
print("Null values in fare (after): ", df["Fare"].isnull().sum())

df["Individual Fare"]= df["Individual Fare"].fillna(df["Individual Fare"].median())
print("Null values in Individual Fare (after): ", df["Individual Fare"].isnull().sum())
Null values in fare (before):  1
Null values in individual (before):  1
Null values in fare (after):  0
Null values in Individual Fare (after):  0
Fare, Individual Fare Distribution after filling null values:¶
In [148]:
sns.kdeplot(data = df,x = df["Fare"], label = "Fare")
plt.title("Fare distribution after replacing null values")
sns.kdeplot(data = df,x = df["Individual Fare"], label = "Individual Fare")
plt.title("Individual Fare distribution after replacing null values")
plt.legend()
plt.show()
No description has been provided for this image

c. Age Group¶

Handling Null Values:¶
In [149]:
fill_dict , temp_df = fill_methods(df, "Age Group")
temp_df["Original"] = df["Age Group"]
temp_df
Out[149]:
mode none unknown rand-cat Original
0 Young Adult Young Adult Young Adult Young Adult Young Adult
1 Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult
2 Young Adult Young Adult Young Adult Young Adult Young Adult
3 Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult
4 Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult
... ... ... ... ... ...
1304 Young Adult None Unknown Middle Aged Adult NaN
1305 Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult
1306 Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult Middle Aged Adult
1307 Young Adult None Unknown Middle Aged Adult NaN
1308 Young Adult None Unknown Middle Aged Adult NaN

1309 rows × 5 columns

In [150]:
plots(df['Age Group'], fill_dict , 'Age Group')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Replacing Null Values:¶
In [151]:
# before replacing null values

print("Null values (before): ", df["Age Group"].isnull().sum())

# After replacing null values
df["Age Group"]=temp_df["mode"]
print("Null values (after): ", df["Age Group"].isnull().sum())
Null values (before):  263
Null values (after):  0
Age group Distribution after filling null values:¶
In [152]:
df["Age Group"].value_counts().plot(kind = "bar")
plt.title("Passenger count across different age groups")
plt.show()
No description has been provided for this image
In [153]:
df["Age Group"].value_counts().plot(kind = "pie", autopct = "%0.1f%%")
plt.title("Passenger Proportion across different age groups")
plt.show()
No description has been provided for this image

d. Deck¶

Handling Null values :¶
In [154]:
df["Deck"] = df["Deck"].astype('category')
fill_dict , temp_df = fill_methods(df, "Deck")
temp_df["Original"] = df["Deck"]
temp_df
Out[154]:
mode none unknown rand-cat Original
0 C None Unknown C NaN
1 C C C C C
2 C None Unknown C NaN
3 C C C C C
4 C None Unknown C NaN
... ... ... ... ... ...
1304 C None Unknown C NaN
1305 C C C C C
1306 C None Unknown C NaN
1307 C None Unknown C NaN
1308 C None Unknown C NaN

1309 rows × 5 columns

In [137]:
plots(df["Deck"],fill_dict,"Deck")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Replacing Null Values:¶
In [155]:
# before replacing null values

print("Null values (before): ", df["Deck"].isnull().sum())

# After replacing null values
df["Deck"]=temp_df["unknown"]
print("Null values (after): ", df["Deck"].isnull().sum())
Null values (before):  1014
Null values (after):  0
Deck Distribution after filling null values:¶
In [156]:
df["Deck"].value_counts().plot(kind = "bar")
plt.title("Passenger count across different Decks")
plt.show()
No description has been provided for this image
In [157]:
df["Deck"].value_counts().plot(kind = "pie" , autopct = "%0.1f%%" )
plt.title("Passenger proportion across different Decks")
plt.show()
No description has been provided for this image

5. Handling Outliers :¶

Detect Outliers in Fare (IQR Method)¶

In [141]:
def outlier_detection_iqr(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower) | (df[col] > upper)]

    print(f"Total Fare Outliers Found: {len(outliers)}")
    print(f"Lower Bound: {lower:.2f}, Upper Bound: {upper:.2f}")

    return outliers

outlier_detection_iqr(df, "Fare")
Total Fare Outliers Found: 171
Lower Bound: -27.17, Upper Bound: 66.34
Out[141]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Family Size Family Type Individual Fare Deck Age Group
1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C Mrs 2 Small 35.641650 C Middle Aged Adult
27 28 0.0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S Mr 6 Medium 43.833333 C Young Adult
31 32 1.0 1 Spencer, Mrs. William Augustus (Marie Eugenie) female 32.0 1 0 PC 17569 146.5208 B78 C Mrs 2 Small 73.260400 B NaN
34 35 0.0 1 Meyer, Mr. Edgar Joseph male 28.0 1 0 PC 17604 82.1708 NaN C Mr 2 Small 41.085400 NaN Young Adult
52 53 1.0 1 Harper, Mrs. Henry Sleeper (Myna Haxtun) female 49.0 1 0 PC 17572 76.7292 D33 C Mrs 2 Small 38.364600 D Senior
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1288 1289 NaN 1 Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... female 48.0 1 1 13567 79.2000 B41 C Mrs 3 Small 26.400000 B Senior
1291 1292 NaN 1 Bonnell, Miss. Caroline female 30.0 0 0 36928 164.8667 C7 S Miss 1 Solo 164.866700 C Young Adult
1298 1299 NaN 1 Widener, Mr. George Dunton male 50.0 1 1 113503 211.5000 C80 C Mr 3 Small 70.500000 C Senior
1302 1303 NaN 1 Minahan, Mrs. William Edward (Lillian E Thorpe) female 37.0 1 0 19928 90.0000 C78 Q Mrs 2 Small 45.000000 C Middle Aged Adult
1305 1306 NaN 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C Dona 1 Solo 108.900000 C Middle Aged Adult

171 rows × 18 columns

In [142]:
def outlier_plot(df,col):
    plt.figure(figsize=(14,5))

    plt.subplot(1,2,1)
    sns.boxplot(data = df, x=df[col])
    plt.title(f"Boxplot: {col} with Outliers")

    plt.subplot(1,2,2)
    sns.histplot(df[col], kde=True)
    plt.title(f"Distribution of {col}")

    plt.show()
outlier_plot(df,'Fare')
No description has been provided for this image
In [158]:
def capping_outliers(df,col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower, upper)
    print(f"{col}Outliers Treated Using Capping.")
    return df

capping_outliers(df,"Fare")
FareOutliers Treated Using Capping.
Out[158]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Family Size Family Type Individual Fare Deck Age Group
0 1 0.0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S Mr 2 Small 3.625000 Unknown Young Adult
1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 66.3438 C85 C Mrs 2 Small 35.641650 C Middle Aged Adult
2 3 1.0 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Miss 1 Solo 7.925000 Unknown Young Adult
3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S Mrs 2 Small 26.550000 C Middle Aged Adult
4 5 0.0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S Mr 1 Solo 8.050000 Unknown Middle Aged Adult
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1304 1305 NaN 3 Spector, Mr. Woolf male 36.0 0 0 A.5. 3236 8.0500 NaN S Mr 1 Solo 8.050000 Unknown Young Adult
1305 1306 NaN 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 66.3438 C105 C Dona 1 Solo 108.900000 C Middle Aged Adult
1306 1307 NaN 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S Mr 1 Solo 7.250000 Unknown Middle Aged Adult
1307 1308 NaN 3 Ware, Mr. Frederick male 24.0 0 0 359309 8.0500 NaN S Mr 1 Solo 8.050000 Unknown Young Adult
1308 1309 NaN 3 Peter, Master. Michael J male 33.0 1 1 2668 22.3583 NaN C Master 3 Small 7.452767 Unknown Young Adult

1309 rows × 18 columns

In [159]:
plt.figure(figsize=(14,5))

plt.subplot(1,2,1)
sns.boxplot(data = df , x=df['Fare'])
plt.title(f"Boxplot: Fare without Outliers")

plt.subplot(1,2,2)
sns.histplot(df['Fare'], kde=True)
plt.title(f"Distribution of Fare")

plt.show()
No description has been provided for this image

Univariate Analysis after removing outliers:¶

In [160]:
num_cat_analysis(df, "Fare", "Survived", agg="mean")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

6.Analysis After cleaning Data:¶

a. Bivariate Analysis¶

i. Age:¶

In [163]:
for i in all_cat_columns:
    num_cat_analysis(df, "Age", i, agg="mean")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [165]:
num_num_analysis(df, "Age", "Fare")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

ii. Fare:¶

In [164]:
for i in all_cat_columns:
    num_cat_analysis(df, "Fare", i, agg="mean")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [166]:
num_num_analysis(df, "Fare","Age")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

iii. Individual Fare:¶

In [174]:
for i in all_cat_columns:
    num_cat_analysis(df, "Individual Fare", i, agg="mean")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [175]:
num_num_analysis(df, "Individual Fare","Age")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

iv. Age group :¶

In [170]:
cat_survial(df,"Age Group")
No description has been provided for this image
No description has been provided for this image
In [171]:
pclass_cat_plot(df,"Age Group")
No description has been provided for this image
No description has been provided for this image
In [173]:
temp_df = df.groupby(["Age Group","Sex"]).size().reset_index(name="Count")
fig = px.bar(temp_df, x="Age Group", y='Count' , color = "Sex", title="Count of Male/Female across Age Group" )
fig.update_layout(
    title_x = 0.5,
    title_font = dict(size = 18 , color = "Green"))

b. Multivariate Analysis¶

Age + Sex + Survived :¶
In [176]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Sex", y="Age", hue="Survived")
plt.title("Age Distribution by Sex and Survival")
plt.show()

g = sns.FacetGrid(df, col="Sex", hue="Survived", height=4)
g.map(sns.kdeplot, "Age", fill=True, alpha=0.6)
g.add_legend()
plt.show()
No description has been provided for this image
No description has been provided for this image

Fare + Pclass + Survived:¶

In [177]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Fare", y="Pclass", hue="Survived", alpha=0.7)
plt.title("Fare vs Pclass by Survival")
plt.show()

plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Pclass", y="Fare", hue="Survived")
plt.title("Fare Distribution by Pclass and Survival")
plt.show()
No description has been provided for this image
No description has been provided for this image

Age + Fare + Survived :¶

In [178]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Age", y="Fare", hue="Survived", alpha=0.7)
plt.title("Age vs Fare by Survival")
plt.show()

sns.jointplot(data=df, x="Age", y="Fare", hue="Survived", kind="kde")
plt.suptitle("KDE Joint Distribution – Age & Fare", y=1.02)
plt.show()
No description has been provided for this image
No description has been provided for this image

Age + Pclass + Survived:¶

In [179]:
age_pclass_survived_plots(df)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

survived + deck + pclass:¶

In [180]:
survived_deck_pclass_plots(df)
No description has been provided for this image
No description has been provided for this image
In [ ]: